Work showcasing comprehensive analysis and exploration for HF data - OLAP system

- Installed Packages -
!python -m pip install "dask[complete]"
!conda install -c plotly plotly=5.13.1
!conda install -c conda-forge nbformat
!conda install -c anaconda seaborn
# Libraries
import psycopg2
from psycopg2.extras import execute_values
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
# Database Connection
conn = psycopg2.connect(
host="localhost",
database="hospital_olap_db",
user="postgres",
password="123m")
curr = conn.cursor()
# Getting Data
sql_select_data = """
SELECT vf.visit_id,
ppd.pr_id,
hfd.oshpd_id2,
hfd.location_id,
pd.p_id,
ed.emp_id,
ppd.name as procedure_name,
ppd.type as procedure_type,
ppd.price as procedure_price,
hfd.name as health_facility_name,
hfd.city as health_facility_city,
hfd.county as health_facility_county,
hfd.license_type as health_facility_license_type,
pd.name as patient_name,
pd.date_of_birth as patient_dob,
ed.emp_name as employee_name,
ed.section as employee_section,
ed.specialty as employee_specialty,
vf.visit_date,
vf.payment as visit_payment,
vf.rating as visit_rating
FROM VisitFact vf
INNER JOIN PatientProcedureDim ppd ON vf.procedure_dim_id = ppd.pr_id
INNER JOIN HealthFacilityDim hfd ON (vf.health_facility_dim_id_fac=hfd.oshpd_id2) AND (vf.health_facility_dim_id_loc=hfd.location_id)
INNER JOIN PatientDim pd ON vf.patient_dim_id=pd.p_id
INNER JOIN EmployeeDim ed ON vf.employee_dim_id=ed.emp_id;
"""
curr.execute(sql_select_data)
# Defining DataFrame
data_cols = ["visit_id", "pr_id", "oshpd_id2", "location_id", "p_id", "emp_id", "procedure_name", "procedure_type",
"procedure_price", "health_facility_name", "health_facility_city", "health_facility_county",
"health_facility_license_type", "patient_name", "patient_dob", "employee_name", "employee_section",
"employee_specialty", "visit_date", "visit_payment", "visit_rating"]
# pd.DataFrame(data, columns=data_cols).info(memory_usage="deep") # CHECK MEMORY
ddf = dd.from_array(np.array(curr.fetchall()), chunksize=10000, columns=data_cols)
# Closing DB Connection
curr.close()
conn.close()
NOTE: using sample data for data loading issues while running seaborn
sample_data = ddf.sample(frac=0.02).compute()
sample_data = sample_data.reset_index(drop=True)
sample_data.head(3)
| visit_id | pr_id | oshpd_id2 | location_id | p_id | emp_id | procedure_name | procedure_type | procedure_price | health_facility_name | ... | health_facility_county | health_facility_license_type | patient_name | patient_dob | employee_name | employee_section | employee_specialty | visit_date | visit_payment | visit_rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202112-MW-541162616589965 | XXXXXXXXSUb0197 | 106304409 | 131 | JS-3698083986821021718 | S-YN-757042735246647 | Surgery_Urinary | surgery | 22500 | KAISER FND HOSP - ORANGE COUNTY - ANAHEIM | ... | ORANGE | Hospital | James Sherman | 1910-04-01 | Casey Dixon | Surgery | Plastic Surgeon | 2021-12-16 | 19910 | 10 |
| 1 | 202210-MK-875761717606062 | XXXXXXXXDBfdb95 | 306334578 | 118 | TW-20197766639107713897 | N-YS-124998365841895 | Dx_Births | diagnosis | 3000 | INDIO SURGERY CENTER INC. | ... | RIVERSIDE | Clinic | Taylor Wiggins | 2019-04-14 | Emily Sparks | Neonatology | Nurse | 2022-10-29 | 2359 | 2 |
| 2 | 202110-TL-38065298499193 | XXXXXXXESI9b76d | 106331194 | 107 | VJ-69585936808875046769 | S-NZ-211513632612704 | EC_Self_Inflicted | injury | 2500 | HEMET VALLEY MEDICAL CENTER | ... | RIVERSIDE | Hospital | Victoria Jackson | 1959-12-11 | Jordan Valdez | Surgery | Endocrinologist Surgeon | 2021-10-06 | 3635 | 5 |
3 rows × 21 columns
2.1) Patient Payments
data_used = sample_data.groupby(['p_id', 'health_facility_license_type']).mean()[['visit_payment']].reset_index()
data_used['p_id'] = data_used.index
sns.scatterplot(data=data_used, x=data_used['p_id'], y=data_used['visit_payment'],
hue="health_facility_license_type", alpha=0.2)
plt.show()
C:\Users\modaj\AppData\Local\Temp\ipykernel_636828\422786003.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function. data_used = sample_data.groupby(['p_id', 'health_facility_license_type']).mean()[['visit_payment']].reset_index()
INSIGHT: most patient payments don't go over 60K per procedure
2.2) Patient Procedures
sns.histplot(data=sample_data, y='procedure_type', hue='health_facility_license_type')
plt.show()
INSIGHT: diagnosis and injury treatments are the most common procedures conducted for patients.
2.3) Patient Ratings
sns.histplot(data=sample_data, x='visit_rating', multiple='dodge',
discrete=True, hue='procedure_type', shrink=.8)
plt.show()
INSIGHT: diagnosis and injury treatments are the most rated procedures.
2.4) Health Facilities
sample_data['visit_date'] = pd.to_datetime(sample_data['visit_date'])
sample_data['visit_year'] = sample_data['visit_date'].dt.year
sea = sns.FacetGrid(sample_data, col="visit_year", hue="health_facility_license_type")
sea.map(sns.histplot, "health_facility_license_type", alpha = .8)
sea.add_legend()
plt.show()
INSIGHT: Hospital have much higher visit rates
2.4) Employee Sections
data_used = sample_data.groupby(["employee_section"]).count()[["employee_name"]].\
rename(columns={"employee_name": "count"}).reset_index()
font = {'size' : 5}
plt.rc('font', **font)
plt.pie(data_used['count'], labels=data_used['employee_section'], autopct='%.0f%%')
plt.show()
INSIGHT: Surgery is usually the biggest department for most health facilities, both in terms of covering number of procedures and employee capacity
3.1) Patient Visits - General Ratings
data_used = ddf.groupby(['visit_rating', 'health_facility_license_type']).count().iloc[:, :1]\
.reset_index().rename(columns={"visit_id": "count"}).compute()
fig = px.bar(data_used, x="visit_rating", y=data_used['count'].unique().tolist(),
color="health_facility_license_type", text_auto=True, color_discrete_sequence=px.colors.qualitative.Prism,
title="Visit Ratings - Rating Counts - Selected Facilities")
fig.update_layout(
xaxis_type='category'
)
# fig.update_xaxes(type='category')
fig.show()
INSIGHT: (all facilities) Ratings generally vary from low to mid to high ratings, almost on a uniform distribution, for all health facilities
3.2) Hospital Visits Over Time
data_used = ddf.groupby(['visit_date']).count().iloc[:, :1].\
reset_index().rename(columns={"visit_id": "visit_count"}).compute()
fig = px.line(data_used, x='visit_date', y="visit_count", color_discrete_sequence=px.colors.qualitative.Prism)
fig.show()
INSIGHT: no pattern showing for visits, meaning patient visits aren't dictated by time of the the year
3.3) Hospital Visits by Procedure
import plotly.express as px
data_used = ddf.groupby(['procedure_type']).count()[["visit_id"]].\
reset_index().rename(columns={"visit_id": "visit_count"}).sort_values(["visit_count"]).compute()
fig = px.bar(data_used, x='visit_count', y='procedure_type', text_auto=True, color='visit_count')
fig.show()
INSIGHT: most visits are based on patient injuries and diagnosis for various illnesses, people rarely solely for finding medicine. as there are pharmacies based outside of hospitals.
INSIGHT: most visits are based on patient injuries and diagnosis for various illnesses, people rarely solely for finding medicine. as there are pharmacies based outside of hospitals
3.4) 10 Most Expensive Procedures
data_used = ddf[["procedure_name", "visit_payment"]].groupby("procedure_name")[["visit_payment"]].sum().\
reset_index().sort_values(['visit_payment'], ascending=False).compute()[:10]
fig = px.pie(data_used, values='visit_payment', names='procedure_name', title='Most Expensive Procedures')
fig.show()
INSIGHT: most expensive procedures and payment are based on surgeries and fatal injuries